This report is a guidebook of selected Q&A that aims to find out what makes women happy.

The dataset: HappyDB is a corpus of 100,000 crowd-sourced happy moments collected via Amazon’s Mechanical Turk. You can read more about it at https://arxiv.org/abs/1801.07746. This report analyzes only a subset of the dataset.

Step 0 - Load all the required libraries

library(tidyverse)
library(tidytext)
library(DT)
library(scales)
library(wordcloud)
library(gridExtra)
library(ngram)
library(shiny) 
library(tm)

Step 1 - Load the data to be cleaned and processed

# Address of the cleaned happy-moments CSV in the public HappyDB repository.
urlfile <- "https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/cleaned_hm.csv"

# Download the file and parse it into a data frame.
hm_data <- urlfile %>%
  read_csv()

Step 2 - Initial Text Data Cleaning

# Build a tm corpus from the cleaned moment text and normalise every document:
# lower-case it, then strip punctuation, digits and redundant whitespace.
corpus <- hm_data$cleaned_hm %>%
  VectorSource() %>%
  VCorpus() %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  # NOTE(review): an empty word vector is passed here, so this step appears to
  # remove nothing -- confirm whether a stop word list was intended.
  tm_map(removeWords, character(0)) %>%
  tm_map(stripWhitespace)

Step 3 - Applying stemming to words and transforming a text mining (tm) object into a tidy data object

# Stem every document in the corpus, convert the result to a tidy data frame
# and keep only the column holding the stemmed text.
stemmed <- corpus %>%
  tm_map(stemDocument) %>%
  tidy() %>%
  select(text)

Step 4 - Preparing a tidy format of the dictionary for stem completion.

# One row per token of the *unstemmed* corpus; the tokens land in a column
# named `dictionary` and serve as candidates for stem completion.
dict <- corpus %>%
  tidy() %>%
  select(text) %>%
  unnest_tokens(output = dictionary, input = text)
## Warning: Outer names are only allowed for unnamed scalar atomic inputs

Step 5 - Eliminating stopwords that do not convey valuable information for our dataset

# Load the `stop_words` data set into the session.
data("stop_words")

# Additional corpus-specific words to treat as stop words.
word <- c(
  "happy", "ago", "yesterday", "lot", "today", "months", "month",
  "happier", "happiest", "last", "week", "past"
)

# Append the custom words to the stop word table, tagging them with their own
# lexicon label so they can be told apart from the stock entries.
stop_words <- bind_rows(
  stop_words,
  tibble(word = word, lexicon = "updated")
)

Step 6 - Merge the stem data and dictionary into a single tibble

# Tag each moment with a running id, split the stemmed text into one token per
# row, and place the original (unstemmed) token next to each stem.
# NOTE(review): bind_cols() pairs rows purely by position, so this assumes the
# stemmed token stream and `dict` contain the same tokens in the same order.
completed <- stemmed %>%
  mutate(id = row_number()) %>%
  unnest_tokens(output = stems, input = text) %>%
  bind_cols(dict) %>%
  # Drop every row whose original word is listed in the stop word table.
  anti_join(stop_words, by = c(dictionary = "word"))
## Warning: Outer names are only allowed for unnamed scalar atomic inputs

Step 7 - Conclude the stem selection process by choosing the word with the highest frequency associated with each stem

# Stem completion: for every stem, pick the original word that occurs most
# often with it, then attach that representative word back onto each token row.
completed <- completed %>%
  group_by(stems) %>%
  # Frequency of each original word within its stem.
  count(dictionary) %>%
  # The most frequent original word becomes the stem's representative.
  mutate(word = dictionary[which.max(n)]) %>%
  ungroup() %>%
  # Collapse to a one-row-per-stem lookup table.
  select(stems, word) %>%
  distinct() %>%
  # Join on the stem explicitly rather than on whatever columns happen to be
  # shared, so the key cannot change silently if columns are added later.
  right_join(completed, by = "stems") %>%
  select(-stems)

Step 8 - Combining stemmed individual words with their corresponding happy moments.

# Rebuild one processed text string per moment by gluing together, with single
# spaces, the completed words that share the same id.
completed <- completed %>%
  group_by(id) %>%
  summarise(
    text = str_c(word, collapse = " ")
  ) %>%
  ungroup()

Step 9 - Keeping track of the happy moments with their own ID

# Give every raw moment the same running id used during text processing and
# attach its processed text. Moments with no entry in `completed` are dropped
# by the inner join.
hm_data <- hm_data %>%
  mutate(id = row_number()) %>%
  # Join on the id explicitly so the key does not depend on which column
  # names the two tables happen to share.
  inner_join(completed, by = "id")

# Show the merged table as an interactive widget.
datatable(hm_data)

Step 10 - Exporting the processed text data into a CSV file

# Make sure the target folder exists first; write_csv() does not create
# missing directories and would otherwise stop with an error.
dir.create("../output", showWarnings = FALSE, recursive = TRUE)

# Save the processed moments for the later analysis steps.
write_csv(hm_data, "../output/processed_moments.csv")